# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# LOAD AND REVIEW DATA
pdata= pd.read_csv('Bank_Personal_Loan_Modelling.csv')
# GET THE SHAPE OF THE DATA TO UNDERSTAND THE SIZE
pdata.shape
# CHECK THE FIRST FEW ROWS AND COLUMNS OF THE DATA-SET
pdata.head(10)
# THE head() OUTPUT GIVES A FIRST LOOK AT THE VARIABLES AND THE RANGES OF VALUES THEY TAKE
# A VERY USEFUL WAY TO GENERATE A SUMMARY REPORT OF THE DATA IS SHOWN BELOW
!pip install pandas_profiling
import pandas_profiling
pdata.profile_report()
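# NOTE: IN NEWER ENVIRONMENTS THE pandas_profiling PACKAGE HAS BEEN RENAMED TO ydata-profiling.
# IF THE IMPORT ABOVE FAILS, THE EQUIVALENT BELOW SHOULD WORK (A SKETCH, ASSUMING ydata-profiling IS INSTALLED)
# !pip install ydata-profiling
# from ydata_profiling import ProfileReport
# ProfileReport(pdata).to_notebook_iframe()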
# A COMPREHENSIVE DETAILED REPORT ABOUT THE DATA IS OUTPUT
#CHECK FOR ANY NULL VALUES IN THE DATA
pdata.isnull().values.any()
#SINCE THE RESULT IS FALSE IT LOOKS LIKE THERE ARE NO NULL VALUES
# ALSO, FROM THE WARNINGS WE CAN SEE THAT ZEROS ARE PRESENT ONLY IN Experience, CCAvg AND Mortgage, WHICH IS ACCEPTABLE
# SINCE A ZERO IS A PLAUSIBLE VALUE FOR EACH OF THESE VARIABLES
pdata.describe()
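# AS A QUICK SANITY CHECK OF THAT CLAIM, THE ONE-LINER BELOW (A MINIMAL SKETCH) COUNTS THE ZEROS COLUMN-WISE
(pdata[['Experience', 'CCAvg', 'Mortgage']] == 0).sum()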
# FIRST, PLOT HISTOGRAMS OF THE FULL DATASET (OUTCOME AND CATEGORICAL VARIABLES INCLUDED)
pdata.hist(stacked=False,bins=100,figsize=(12,30),layout=(14,2));
# FOR CLEANER HISTOGRAM PLOTS, THE DATA IS COPIED TO ANOTHER VARIABLE WITH THE OUTCOME VARIABLE AND THE OTHER CATEGORICAL VARIABLES DROPPED
pdata1=pdata.drop(['Personal Loan','Education','Experience','Family','Securities Account','Online','ZIP Code','CD Account','CreditCard'], axis=1)
pdata1.hist(stacked=False,bins=100,figsize=(12,30),layout=(14,2));
# THE NEXT STEP IS TO IDENTIFY CORRELATIONS IN THE DATA; THIS TIME ONLY THE OUTPUT COLUMN IS DROPPED FROM THE DATASET
pdata2=pdata.drop(['Personal Loan'], axis=1)
pdata2.corr()
# HERE IN THE TABLE WE CAN ALREADY OBSERVE NUMERICALLY THAT EXPERIENCE HAS A VERY HIGH CORRELATION WITH AGE,
# WITH A CORRELATION VALUE OF ABOUT 0.99
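# THE EXACT PAIRWISE VALUE CAN ALSO BE PULLED OUT DIRECTLY (A SKETCH):
pdata2['Age'].corr(pdata2['Experience'])   # expected to be close to 0.99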
def plot_corr(df, size=20):
    """Plot the correlation matrix of df as a colour-coded grid with labelled axes."""
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    cax = ax.matshow(corr)       # colour-code the correlation values
    fig.colorbar(cax)            # add a colour scale so the values can be read off
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
# THIS STEP VISUALIZES THE CORRELATIONS GRAPHICALLY
plot_corr(pdata2)
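# AS AN ALTERNATIVE SKETCH, A SEABORN HEATMAP GIVES THE SAME PICTURE WITH THE
# CORRELATION VALUES ANNOTATED IN EACH CELL (USES ONLY LIBRARIES ALREADY IMPORTED ABOVE)
plt.figure(figsize=(12, 10))
sns.heatmap(pdata2.corr(), annot=True, fmt='.2f', cmap='coolwarm')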
# AGAIN WE OBSERVE THAT EXPERIENCE IS HIGHLY CORRELATED WITH AGE, SO IN THIS CASE WE CAN DROP EXPERIENCE AND KEEP AGE
# PLOTTING THE COMPLETE DATASET, INCLUDING THE CATEGORICAL VARIABLES, SHOWS TOO MUCH INFORMATION TO BE USEFUL
sns.pairplot(pdata,diag_kind='kde')
# PLOTTING THE DATASET AFTER DROPPING THE CATEGORICAL VARIABLES IS A LOT EASIER TO UNDERSTAND
sns.pairplot(pdata1,diag_kind='kde')
# IT ALSO SHOWS THAT MORTGAGE HAS A LONG RIGHT TAIL WITH A FEW EXTREME VALUES THAT LOOK LIKE OUTLIERS
EDUCATION VS PERSONAL LOAN (EXAMINATION OF CATEGORICAL VARIABLES ON PERSONAL LOAN)
# NOW I AM GOING TO USE THE CROSS-TAB FUNCTION TO EXAMINE THE DISTRIBUTION
# OF EACH CATEGORICAL VARIABLE AGAINST THE TARGET VARIABLE, FOR EXAMPLE WHAT PERCENT OF GRADUATES HAVE
# AVAILED PERSONAL LOANS
edu=pd.crosstab(pdata['Education'],pdata['Personal Loan'])
edu.div(edu.sum(1).astype(float),axis=0).plot(kind='bar',stacked=True) # normalize each row to proportions before plotting
print('cross tabulation can be given as:','\n',edu)
FAMILY VS PERSONAL LOAN
edu=pd.crosstab(pdata['Family'],pdata['Personal Loan'])
edu.div(edu.sum(1).astype(float),axis=0).plot(kind='bar',stacked=True)
print('cross tabulation can be given as:','\n',edu)
# AS CAN BE SEEN, FAMILY SIZE IS NOT AS IMPORTANT A FACTOR IN
# DECIDING WHETHER THE PERSONAL LOAN IS AVAILED
EXAMINATION OF BOOLEAN VARIABLES ON THE TARGET VARIABLE
CD_ACCOUNT vs PERSONAL LOAN
pdata.head()
edu=pd.crosstab(pdata['CD Account'],pdata['Personal Loan'])
edu.div(edu.sum(1).astype(float),axis=0).plot(kind='bar',stacked=True)
print('cross tabulation can be given as:','\n',edu)
# THE EFFECT OF CD ACCOUNT IS SHOWN BELOW: THOSE HOLDING A CD ACCOUNT
# ARE MORE LIKELY TO AVAIL THE PERSONAL LOAN
ONLINE vs PERSONAL LOAN
edu=pd.crosstab(pdata['Online'],pdata['Personal Loan'])
edu.div(edu.sum(1).astype(float),axis=0).plot(kind='bar',stacked=True)
print('cross tabulation can be given as:','\n',edu)
# THE EFFECT OF ONLINE BANKING IS SHOWN BELOW: HAVING
# ONLINE BANKING DOES NOT HAVE ANY NOTICEABLE EFFECT ON PERSONAL LOAN
Securities_Account vs Personal Loan
edu=pd.crosstab(pdata['Securities Account'],pdata['Personal Loan'])
edu.div(edu.sum(1).astype(float),axis=0).plot(kind='bar',stacked=True)
print('cross tabulation can be given as:','\n',edu)
# THE EFFECT OF SECURITIES ACCOUNT IS SHOWN BELOW: HAVING A
# SECURITIES ACCOUNT DOES NOT HAVE ANY NOTICEABLE EFFECT ON PERSONAL LOAN
Numerical Variables vs Personal Loan
# WE WILL LOOK AT THE MEAN OF EACH NUMERIC INDEPENDENT VARIABLE AND THEN
# COMPARE IT ACROSS THE TARGET VARIABLE USING THE GROUPBY FUNCTION
ax1=pdata.groupby('Personal Loan')['Age'].mean().plot(kind='bar')
ax1.set_ylabel('Age')
# WHAT IS HAPPENING IN THE PLOT BELOW IS THAT THE Y-AXIS SHOWS THE MEAN AGE
# OF THE CUSTOMERS, PLOTTED SEPARATELY FOR PEOPLE WHO HAVE AND HAVE NOT AVAILED
# THE PERSONAL LOAN. AS CAN BE SEEN, THERE IS NO MAJOR DIFFERENCE IN MEAN AGE,
# SO WE MAY HAVE TO BREAK THE AGES INTO BINS AND THEN REVIEW THE DATA AGAIN.
# BREAK THE AGE DATA SET IN TO SEVERAL BINS/CATEGORIES
bins = [25,35,45,55,65] # from pdata.describe() we can get the age range and arrange the bins accordingly
group = ['Very Young','Youth','middleage','oldage']
pdata_age = pdata.copy() # work on a copy so the Age_bin column does not leak into pdata
pdata_age['Age_bin'] = pd.cut(pdata_age['Age'], bins, labels=group) # ages outside the bin edges (if any) become NaN
age = pd.crosstab(pdata_age['Age_bin'], pdata_age['Personal Loan'])
age.div(age.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
# AS CAN BE SEEN BELOW, THE AGE DISTRIBUTION DOES NOT AFFECT THE PERSONAL LOAN DISTRIBUTION.
# PLEASE NOTE THAT SINCE AGE AND EXPERIENCE ARE CLOSELY CORRELATED, WE CAN EXPECT
# THE SAME RESULT FOR EXPERIENCE.
pdata.head()
RELATION BETWEEN CREDIT CARD USAGE AND PERSONAL LOAN
ax1=pdata.groupby('Personal Loan')['CCAvg'].mean().plot(kind='bar')
ax1.set_ylabel('Credit Card Spending')
# AS CAN BE SEEN FROM THE BAR PLOT BELOW, THE HIGHER THE CREDIT CARD SPENDING,
# THE MORE LIKELY THE PERSONAL LOAN IS TO BE AVAILED
INCOME VS PERSONAL LOAN
ax1=pdata.groupby('Personal Loan')['Income'].mean().plot(kind='bar')
ax1.set_ylabel('Income')
# AS CAN BE SEEN FROM THE PLOT BELOW, PEOPLE WITH HIGHER INCOMES AVAIL THE
# PERSONAL LOAN MORE OFTEN, WHICH IS TO BE EXPECTED
# OUTLIER EVALUATION IN THE MORTGAGE VARIABLE
# AS SEEN IN THE PAIR PLOT AND THE describe() OUTPUT, THE MORTGAGE
# VARIABLE HAS OUTLIERS IN ITS DISTRIBUTION.
# THE APPROACH TAKEN HERE IS TO KEEP ONLY THE ROWS WHOSE MORTGAGE Z-SCORE
# IS LESS THAN 3.
from scipy import stats
pdata['Mortgage_zscore']=np.abs(stats.zscore(pdata['Mortgage']))
pdata3=pdata[pdata['Mortgage_zscore']<3].copy() # copy so the drop below does not trigger a SettingWithCopyWarning
pdata3.drop('Mortgage_zscore',axis=1,inplace=True)
pdata3.shape
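# A QUICK CHECK (SKETCH) OF HOW MANY ROWS THE Z-SCORE FILTER REMOVED
print(len(pdata) - len(pdata3), "rows removed as Mortgage outliers")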
sns.pairplot(pdata3,diag_kind='kde')
# NOTE THAT THE HIGHEST MORTGAGE VALUE WAS EARLIER AROUND 600 AND IS NOW AROUND 300, SO THE EFFECT OF THE OUTLIERS HAS BEEN REDUCED
# NOW WE WILL TRY TO SEE HOW MANY PEOPLE HAVE AVAILED PERSONAL LOAN IN THE
# PRESENT DATA SET
n_personal_loan_takers=len(pdata.loc[pdata['Personal Loan']==1])
n_personal_loan_non_takers=len(pdata.loc[pdata['Personal Loan']==0])
print("The Number of people who have availed Personal Loans:{0} ({1:2.2f}%)".format(n_personal_loan_takers,(n_personal_loan_takers/(n_personal_loan_takers + n_personal_loan_non_takers))*100 ))
print("The Number of people who have not availed Personal Loans:{0} ({1:2.2f}%)".format(n_personal_loan_non_takers,(n_personal_loan_non_takers/(n_personal_loan_takers + n_personal_loan_non_takers))*100 ))
AS CAN BE SEEN, ABOUT 10% OF THE PEOPLE HAVE TAKEN PERSONAL LOANS AND ABOUT 90% HAVE NOT
# BEFORE PROCEEDING IT IS IMPORTANT TO DROP THE FOLLOWING VARIABLES FROM THE DATA SET:
# 1.) EXPERIENCE - HIGHLY CORRELATED WITH AGE AND HENCE REDUNDANT
# 2.) ID - A UNIQUE SERIAL NUMBER WITH NO PREDICTIVE VALUE
# 3.) ZIP CODE - A HIGH-CARDINALITY CATEGORICAL IDENTIFIER WITH NO ORDINAL MEANING
# NOTE THAT WE ARE NOW USING THE DATASET pdata3, IN WHICH THE OUTLIERS HAVE BEEN TAKEN CARE OF
pdata4=pdata3.drop(['Experience','ZIP Code','ID'], axis=1)
pdata4.head()
from sklearn.model_selection import train_test_split
X=pdata4.drop('Personal Loan',axis=1) # Dropping the dependent variable and keeping all the independent variables
Y= pdata4['Personal Loan'] # PREDICTED CLASS (1= LOAN AVAILED AND 0= LOAN NOT AVAILED)
x_train,x_test,y_train,y_test=train_test_split(X,Y, test_size=0.3,random_state=1)
x_train.head(10)
y_train.head(10)
# CHECKING THE SPLIT OF THE DATA
print("{0:0.2f}% data is in the training set".format((len(x_train)/len(pdata4.index))*100))
print("{0:0.2f}% data is in the testing set".format((len(x_test)/len(pdata4.index))*100))
# NOW WE CHECK THAT THE SPLIT BETWEEN THOSE WHO HAVE AVAILED THE LOAN AND THOSE WHO HAVE NOT
# IS SIMILAR ACROSS THE ORIGINAL, TRAINING AND TESTING DATASETS
n_personal_loan_takers=len(pdata.loc[pdata['Personal Loan']==1])
n_personal_loan_non_takers=len(pdata.loc[pdata['Personal Loan']==0])
print("ORIGINAL DATASET- who have availed Personal Loans:{0} ({1:2.2f}%)".format(n_personal_loan_takers,(n_personal_loan_takers/(n_personal_loan_takers + n_personal_loan_non_takers))*100 ))
print("ORIGINAL DATASET who have not availed Personal Loans:{0} ({1:2.2f}%)".format(n_personal_loan_non_takers,(n_personal_loan_non_takers/(n_personal_loan_takers + n_personal_loan_non_takers))*100 ))
print("")
###########TRAINING DATASET#####################
n_personal_loan_takers_Training=len(y_train[y_train[:]==1])
n_personal_loan_non_takers_Training=len(y_train[y_train[:]==0])
print("TRAINING DATASET- who have availed Personal Loans:{0} ({1:2.2f}%)".format(n_personal_loan_takers_Training,(n_personal_loan_takers_Training/(n_personal_loan_takers_Training + n_personal_loan_non_takers_Training))*100 ))
print("TRAINING DATASET who have not availed Personal Loans:{0} ({1:2.2f}%)".format(n_personal_loan_non_takers_Training,(n_personal_loan_non_takers_Training/(n_personal_loan_takers_Training + n_personal_loan_non_takers_Training))*100 ))
print("")
###########TESTING DATASET#####################
n_personal_loan_takers_Testing=len(y_test[y_test[:]==1])
n_personal_loan_non_takers_Testing=len(y_test[y_test[:]==0])
print("TESTING DATASET- who have availed Personal Loans:{0} ({1:2.2f}%)".format(n_personal_loan_takers_Testing,(n_personal_loan_takers_Testing/(n_personal_loan_takers_Testing + n_personal_loan_non_takers_Testing))*100 ))
print("TESTING DATASET who have not availed Personal Loans:{0} ({1:2.2f}%)".format(n_personal_loan_non_takers_Testing,(n_personal_loan_non_takers_Testing/(n_personal_loan_takers_Testing + n_personal_loan_non_takers_Testing))*100 ))
print("")
# AS CAN BE SEEN, THE SPLIT BETWEEN LOAN TAKERS AND NON-TAKERS IN THE TRAINING AND TESTING DATASETS IS
# KEPT APPROXIMATELY THE SAME AS IN THE ORIGINAL DATA.
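# NOTE: THE SPLIT ABOVE ONLY HAPPENS TO PRESERVE THE CLASS PROPORTIONS; TO GUARANTEE IT,
# train_test_split ACCEPTS A stratify ARGUMENT (A SKETCH; THE VARIABLE NAMES ARE HYPOTHETICAL)
x_tr, x_te, y_tr, y_te = train_test_split(X, Y, test_size=0.3, random_state=1, stratify=Y)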
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# Fit the model on train
model= LogisticRegression(solver="liblinear")
model.fit(x_train,y_train)
#Predict on Test
y_predict = model.predict(x_test)
coef_df=pd.DataFrame(model.coef_) # converts the coefficient matrix in to a dataframe
coef_df['intercept']=model.intercept_
print(coef_df)
#EVALUATE MODEL SCORE - TEST SCORE
model_score = model.score(x_test,y_test)
print(model_score)
# The model score on test data is close to 95% which is very good.
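# AS A BASELINE SANITY CHECK (SKETCH): SINCE ABOUT 90% OF CUSTOMERS DID NOT AVAIL THE LOAN,
# ALWAYS PREDICTING 0 WOULD ALREADY SCORE ABOUT 90%, SO 95% SHOULD BE READ AGAINST THAT BASELINE
print("Majority-class baseline accuracy: {0:.4f}".format((y_test == 0).mean()))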
cm=metrics.confusion_matrix(y_test,y_predict,labels=[1,0])
df_cm=pd.DataFrame(cm,index=["Actual 1","Actual 0"],
                   columns=["Predict 1","Predict 0"])
plt.figure(figsize=(7,5))
sns.heatmap(df_cm,annot=True,fmt='g') # fmt='g' shows plain counts instead of scientific notation
#THE CONFUSION MATRIX FOR LOGISTIC REGRESSION IS SHOWN AS ABOVE:
#TRUE POSITIVES (TP): THE MODEL CORRECTLY PREDICTED THAT 66 PEOPLE WILL AVAIL THE PERSONAL LOAN
#TRUE NEGATIVES (TN): THE MODEL CORRECTLY PREDICTED THAT 1300 PEOPLE WILL NOT AVAIL THE PERSONAL LOAN
#FALSE POSITIVE (FP): THE MODEL PREDICTED THAT 15 PEOPLE WILL AVAIL LOAN BUT THEY DID NOT - TYPE I ERROR
#FALSE NEGATIVE (FN): THE MODEL PREDICTED THAT 62 PEOPLE WILL NOT AVAIL THE LOAN BUT THEY DID END UP AVAILING THE PERSONAL LOAN - TYPE II ERROR (MORE IMPORTANT IN THIS CASE)
#EVALUATION OF THE LOGISTIC REGRESSION MODEL
print("Classification Report")
print (metrics.classification_report(y_test,y_predict,labels=[1,0]))
# AS CAN BE SEEN ABOVE, THE PRECISION IS ABOUT 81% AND THE RECALL IS ABOUT 52%,
# WHICH MEANS THAT OUT OF 100 POSITIVE IDENTIFICATIONS, ABOUT 81 ARE PEOPLE WHO WILL REALLY AVAIL THE LOAN,
# WHILE ONLY ABOUT 52% OF THE PEOPLE WHO ACTUALLY AVAIL THE LOAN ARE IDENTIFIED BY THE MODEL.
# SUPPORT FOR CLASS 1 IS 128, WHICH MEANS THE TEST SET CONTAINS 128 PEOPLE WHO ACTUALLY AVAILED THE PERSONAL LOAN
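# A WORKED CHECK (SKETCH) OF THOSE NUMBERS USING THE CONFUSION-MATRIX COUNTS QUOTED ABOVE
TP, FN, FP = 66, 62, 15
print("precision = {0:.2f}".format(TP / (TP + FP)))   # 66/81  ~ 0.81
print("recall    = {0:.2f}".format(TP / (TP + FN)))   # 66/128 ~ 0.52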
# SPLIT DATA
from sklearn.model_selection import train_test_split
X=pdata4.drop('Personal Loan',axis=1) # Dropping the dependent variable and keeping all the independent variables
Y= pdata4['Personal Loan'] # PREDICTED CLASS (1= LOAN AVAILED AND 0= LOAN NOT AVAILED)
x_train1,x_test1,y_train1,y_test1=train_test_split(X,Y, test_size=0.3,random_state=1)
x_train1.head(10)
y_train1.head(10)
from sklearn.naive_bayes import GaussianNB # using Gaussian algorithm from Naive Bayes
#create the model
Personal_Loan_Model=GaussianNB()
Personal_Loan_Model.fit(x_train1,y_train1) # a pandas Series can be passed directly; .ravel() is not needed
PERFORMANCE OF OUR MODEL WITH TRAINING AND TESTING DATA
Personal_Loan_Train_Predict = Personal_Loan_Model.predict(x_train1)
from sklearn import metrics
print ("Model Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train1,Personal_Loan_Train_Predict)))
print()
Personal_Loan_Test_Predict = Personal_Loan_Model.predict(x_test1)
from sklearn import metrics
print ("Model Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test1,Personal_Loan_Test_Predict)))
print()
print ("Confusion Matrix")
cm=metrics.confusion_matrix(y_test1, Personal_Loan_Test_Predict, labels=[1,0])
df_cm=pd.DataFrame(cm,index=["Actual 1","Actual 0"],
                   columns=["Predict 1","Predict 0"])
plt.figure(figsize=(7,5))
sns.heatmap(df_cm,annot=True,fmt='g') # fmt='g' shows plain counts instead of scientific notation
#THE CONFUSION MATRIX FOR NAIVE BAYES IS SHOWN AS ABOVE:
#TRUE POSITIVES (TP): THE MODEL CORRECTLY PREDICTED THAT 75 PEOPLE WILL AVAIL THE PERSONAL LOAN
#TRUE NEGATIVES (TN): THE MODEL CORRECTLY PREDICTED THAT 1200 PEOPLE WILL NOT AVAIL THE PERSONAL LOAN
#FALSE POSITIVE (FP): THE MODEL PREDICTED THAT 100 PEOPLE WILL AVAIL LOAN BUT THEY DID NOT - TYPE I ERROR
#FALSE NEGATIVE (FN): THE MODEL PREDICTED THAT 53 PEOPLE WILL NOT AVAIL THE LOAN BUT THEY DID END UP AVAILING THE PERSONAL LOAN - TYPE II ERROR (MORE IMPORTANT IN THIS CASE)
#EVALUATION OF THE NAIVE BAYES MODEL
print("Classification Report")
print (metrics.classification_report(y_test1, Personal_Loan_Test_Predict,labels=[1,0]))
# AS CAN BE SEEN ABOVE, THE PRECISION IS ABOUT 42% AND THE RECALL IS ABOUT 59%,
# WHICH MEANS THAT OUT OF 100 POSITIVE IDENTIFICATIONS, ONLY ABOUT 42 ARE PEOPLE WHO WILL REALLY AVAIL THE LOAN,
# WHILE ABOUT 59% OF THE PEOPLE WHO ACTUALLY AVAIL THE LOAN ARE IDENTIFIED BY THE MODEL.
# SUPPORT IS AGAIN 128, I.E. THE TEST SET CONTAINS 128 PEOPLE WHO ACTUALLY AVAILED THE PERSONAL LOAN. HERE THE RECALL HAS INCREASED
# AT THE EXPENSE OF PRECISION, SO THE FALSE NEGATIVES HAVE GONE DOWN, WHICH IS MORE IMPORTANT IN THIS CASE.
from sklearn.model_selection import train_test_split
X=pdata4.drop('Personal Loan',axis=1) # Dropping the dependent variable and keeping all the independent variables
Y= pdata4['Personal Loan'] # PREDICTED CLASS (1= LOAN AVAILED AND 0= LOAN NOT AVAILED)
x_train2,x_test2,y_train2,y_test2=train_test_split(X,Y, test_size=0.3,random_state=1)
x_train2.head(10)
y_train2.head(10)
#BUILD kNN MODEL
from sklearn.neighbors import KNeighborsClassifier
NNH=KNeighborsClassifier(n_neighbors=5, weights ='distance')
# CALL NEAREST NEIGHBOR ALGORITHM
NNH.fit(x_train2,y_train2)
#EVALUATE THE PERFORMANCE OF kNN MODEL-test data
Personal_Loan_Test_Predict2=NNH.predict(x_test2)
NNH.score(x_test2,y_test2)
#EVALUATE THE PERFORMANCE OF THE kNN MODEL ON TRAIN DATA. NOTE THAT WITH weights='distance' THE TRAINING SCORE IS 1 BY
# CONSTRUCTION (EACH TRAINING POINT IS ITS OWN ZERO-DISTANCE NEIGHBOUR), SO A PERFECT TRAIN SCORE SUGGESTS, BUT DOES NOT BY ITSELF PROVE, OVERFITTING
Loan_Predict2=NNH.predict(x_train2)
NNH.score(x_train2,y_train2)
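# SINCE kNN IS DISTANCE-BASED, FEATURE SCALING USUALLY MATTERS; THE SKETCH BELOW
# (ASSUMING sklearn's StandardScaler AND make_pipeline) COMPARES A FEW k VALUES ON STANDARDIZED FEATURES
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
for k in [3, 5, 7, 9]:
    pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    pipe.fit(x_train2, y_train2)
    print("k={0}: test accuracy={1:.4f}".format(k, pipe.score(x_test2, y_test2)))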
print ("Confusion Matrix")
cm=metrics.confusion_matrix(y_test2, Personal_Loan_Test_Predict2, labels=[1,0])
df_cm=pd.DataFrame(cm,index=["Actual 1","Actual 0"],
                   columns=["Predict 1","Predict 0"])
plt.figure(figsize=(7,5))
sns.heatmap(df_cm,annot=True,fmt='g') # fmt='g' shows plain counts instead of scientific notation
#THE CONFUSION MATRIX FOR kNN IS SHOWN AS ABOVE:
#TRUE POSITIVES (TP): THE MODEL CORRECTLY PREDICTED THAT 50 PEOPLE WILL AVAIL THE PERSONAL LOAN
#TRUE NEGATIVES (TN): THE MODEL CORRECTLY PREDICTED THAT 1300 PEOPLE WILL NOT AVAIL THE PERSONAL LOAN
#FALSE POSITIVE (FP): THE MODEL PREDICTED THAT 45 PEOPLE WILL AVAIL LOAN BUT THEY DID NOT - TYPE I ERROR
#FALSE NEGATIVE (FN): THE MODEL PREDICTED THAT 78 PEOPLE WILL NOT AVAIL THE LOAN BUT THEY DID END UP AVAILING THE PERSONAL LOAN - TYPE II ERROR (MORE IMPORTANT IN THIS CASE)
#EVALUATION OF THE kNN MODEL
print("Classification Report")
print (metrics.classification_report(y_test2, Personal_Loan_Test_Predict2,labels=[1,0]))
# AS CAN BE SEEN ABOVE, THE PRECISION IS ABOUT 53% AND THE RECALL IS ABOUT 39%,
# WHICH MEANS THAT OUT OF 100 POSITIVE IDENTIFICATIONS, ABOUT 53 ARE PEOPLE WHO WILL REALLY AVAIL THE LOAN,
# WHILE ONLY ABOUT 39% OF THE PEOPLE WHO ACTUALLY AVAIL THE LOAN ARE IDENTIFIED BY THE MODEL.
# SUPPORT IS AGAIN 128. HERE BOTH PRECISION AND RECALL ARE LOWER THAN FOR LOGISTIC REGRESSION,
# AND THE FALSE NEGATIVES HAVE GONE UP TO 78, WHICH IS UNDESIRABLE IN THIS CASE.
# LET US SUMMARIZE OUR FINDINGS:
# ACCURACY SCORES OF THE MODELS
#1.) LOGISTIC REGRESSION - 95% ----HIGHEST
#2.) NAIVE BAYES - 89%
#3.) kNN - 92%
# BUT THE DECISION HERE CANNOT BE BASED ON ACCURACY ALONE, AS WE NEED TO UNDERSTAND WHICH METRIC SHOULD BE CHOSEN FOR EVALUATION
# LET US LOOK AT PRECISION VALUE HERE
#1.) LOGISTIC REGRESSION - 81%--- HIGHEST
#2.) NAIVE BAYES - 42%
#3.) kNN - 53%
# PRECISION GIVES US AN IDEA OF WHAT PROPORTIONS OF POSITIVE IDENTIFICATIONS WERE CORRECT
# LET US LOOK AT RECALL VALUE HERE
#1.) LOGISTIC REGRESSION - 52%
#2.) NAIVE BAYES - 59% -- HIGHEST
#3.) kNN - 39%
# RECALL IS AN IMPORTANT MEASURE OF WHAT PROPORTION OF ACTUAL POSITIVES WERE IDENTIFIED CORRECTLY,
# AND IN THIS CASE IT IS THE KEY METRIC, AS WE CANNOT AFFORD TO LET GO OF PEOPLE WHO
# WOULD HAVE AVAILED THE LOAN BUT WERE NOT IDENTIFIED POSITIVELY.
# ONE FINAL PARAMETER WE WILL LOOK AT IS THE NUMBER OF TRUE POSITIVE CASES IDENTIFIED:
#1.) LOGISTIC REGRESSION - 66 TRUE POSITIVE CASES WERE IDENTIFIED
#2.) NAIVE BAYES - 75 TRUE POSITIVE CASES WERE IDENTIFIED
#3.) kNN - 50 TRUE POSITIVE CASES WERE IDENTIFIED
# BASED ON THE RESULTS SUMMARY ABOVE, EVEN THOUGH THE MODEL ACCURACY SCORE IS HIGHER FOR LOGISTIC REGRESSION,
# THE NUMBER OF TRUE POSITIVE CASES AND THE RECALL RATE ARE HIGHEST FOR THE NAIVE BAYES APPROACH.
# IN THIS CASE WE NEED TO PAY MORE ATTENTION TO THE RECALL RATE, AS WE CANNOT AFFORD TO MISS CUSTOMERS
# WHO WOULD HAVE AVAILED THE PERSONAL LOAN.
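# A COMPACT SIDE-BY-SIDE RECAP (SKETCH; REUSES THE THREE FITTED MODELS FROM ABOVE)
for name, mdl, xs, ys in [("Logistic Regression", model, x_test, y_test),
                          ("Naive Bayes", Personal_Loan_Model, x_test1, y_test1),
                          ("kNN", NNH, x_test2, y_test2)]:
    pred = mdl.predict(xs)
    print("{0}: accuracy={1:.2f} precision={2:.2f} recall={3:.2f}".format(
        name,
        metrics.accuracy_score(ys, pred),
        metrics.precision_score(ys, pred),
        metrics.recall_score(ys, pred)))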